In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
import sklearn
import scipy


sns.set()
In [10]:
path="Diabetes Prediction.csv"
data = pd.read_csv(path)
data.head()
Out[10]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [11]:
data.shape
Out[11]:
(768, 9)
In [12]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [13]:
data.describe().T
Out[13]:
count mean std min 25% 50% 75% max
Pregnancies 768.0 3.845052 3.369578 0.000 1.00000 3.0000 6.00000 17.00
Glucose 768.0 120.894531 31.972618 0.000 99.00000 117.0000 140.25000 199.00
BloodPressure 768.0 69.105469 19.355807 0.000 62.00000 72.0000 80.00000 122.00
SkinThickness 768.0 20.536458 15.952218 0.000 0.00000 23.0000 32.00000 99.00
Insulin 768.0 79.799479 115.244002 0.000 0.00000 30.5000 127.25000 846.00
BMI 768.0 31.992578 7.884160 0.000 27.30000 32.0000 36.60000 67.10
DiabetesPedigreeFunction 768.0 0.471876 0.331329 0.078 0.24375 0.3725 0.62625 2.42
Age 768.0 33.240885 11.760232 21.000 24.00000 29.0000 41.00000 81.00
Outcome 768.0 0.348958 0.476951 0.000 0.00000 0.0000 1.00000 1.00
In [14]:
data_feature = data.columns

for feature in data_feature:
    p = sns.distplot(a = data[feature])
    plt.show()
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_3860\4286188437.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p = sns.distplot(a = data[feature])
No description has been provided for this image
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_3860\4286188437.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p = sns.distplot(a = data[feature])
No description has been provided for this image
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_3860\4286188437.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p = sns.distplot(a = data[feature])
No description has been provided for this image
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_3860\4286188437.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p = sns.distplot(a = data[feature])
No description has been provided for this image
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_3860\4286188437.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p = sns.distplot(a = data[feature])
No description has been provided for this image
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_3860\4286188437.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p = sns.distplot(a = data[feature])
No description has been provided for this image
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_3860\4286188437.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p = sns.distplot(a = data[feature])
No description has been provided for this image
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_3860\4286188437.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p = sns.distplot(a = data[feature])
No description has been provided for this image
C:\Users\Lenovo\AppData\Local\Temp\ipykernel_3860\4286188437.py:4: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p = sns.distplot(a = data[feature])
No description has been provided for this image
In [15]:
data_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
In [16]:
data[data_zeros] = np.where((data[data_zeros] == 0), np.nan, data[data_zeros])
In [17]:
data.isnull().sum()
Out[17]:
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64
In [18]:
p = data.hist(figsize = (20,20))
No description has been provided for this image
In [19]:
data.describe().T
Out[19]:
count mean std min 25% 50% 75% max
Pregnancies 768.0 3.845052 3.369578 0.000 1.00000 3.0000 6.00000 17.00
Glucose 763.0 121.686763 30.535641 44.000 99.00000 117.0000 141.00000 199.00
BloodPressure 733.0 72.405184 12.382158 24.000 64.00000 72.0000 80.00000 122.00
SkinThickness 541.0 29.153420 10.476982 7.000 22.00000 29.0000 36.00000 99.00
Insulin 394.0 155.548223 118.775855 14.000 76.25000 125.0000 190.00000 846.00
BMI 757.0 32.457464 6.924988 18.200 27.50000 32.3000 36.60000 67.10
DiabetesPedigreeFunction 768.0 0.471876 0.331329 0.078 0.24375 0.3725 0.62625 2.42
Age 768.0 33.240885 11.760232 21.000 24.00000 29.0000 41.00000 81.00
Outcome 768.0 0.348958 0.476951 0.000 0.00000 0.0000 1.00000 1.00
In [20]:
sns.pairplot(data,diag_kind='kde');
No description has been provided for this image
In [21]:
data['Glucose'] = data['Glucose'].fillna(data['Glucose'].mean())
In [22]:
data['BloodPressure'] = data['BloodPressure'].fillna(data['BloodPressure'].mean())
In [23]:
sns.boxplot(y = 'SkinThickness', data = data)
Out[23]:
<Axes: ylabel='SkinThickness'>
No description has been provided for this image
In [24]:
data['SkinThickness'].mean(), data['SkinThickness'].median()
Out[24]:
(np.float64(29.153419593345657), np.float64(29.0))
In [25]:
data['SkinThickness'] = data['SkinThickness'].fillna(data['SkinThickness'].median())
In [26]:
data['Insulin'].mean(), data['Insulin'].median()
Out[26]:
(np.float64(155.5482233502538), np.float64(125.0))
In [27]:
data['Insulin'] = data['Insulin'].fillna(data['Insulin'].median())
In [28]:
data['BMI'].mean(), data['BMI'].median()
Out[28]:
(np.float64(32.457463672391015), np.float64(32.3))
In [29]:
data['BMI'] = data['BMI'].fillna(data['BMI'].median())
In [30]:
for i in range(9):
    print(data.columns[i])
     
Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Age
Outcome
In [31]:
p = data.hist(figsize = (20,20))
No description has been provided for this image
In [32]:
sns.pairplot(data =data, hue = 'Outcome')
plt.show()
No description has been provided for this image
In [33]:
plt.figure(figsize=(12,10))
sns.heatmap(data.corr(), annot = True, cmap = "YlGnBu")
plt.show()
No description has been provided for this image
In [34]:
from scipy import stats
for feature in data.columns:
    stats.probplot(data[feature], plot = plt)
    plt.title(feature)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [35]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
In [36]:
data.head()
Out[36]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148.0 72.0 35.0 125.0 33.6 0.627 50 1
1 1 85.0 66.0 29.0 125.0 26.6 0.351 31 0
2 8 183.0 64.0 29.0 125.0 23.3 0.672 32 1
3 1 89.0 66.0 23.0 94.0 28.1 0.167 21 0
4 0 137.0 40.0 35.0 168.0 43.1 2.288 33 1
In [37]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
In [38]:
X.head()
Out[38]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age
0 6 148.0 72.0 35.0 125.0 33.6 0.627 50
1 1 85.0 66.0 29.0 125.0 26.6 0.351 31
2 8 183.0 64.0 29.0 125.0 23.3 0.672 32
3 1 89.0 66.0 23.0 94.0 28.1 0.167 21
4 0 137.0 40.0 35.0 168.0 43.1 2.288 33
In [39]:
y.head()
     
Out[39]:
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64
In [40]:
X.head()
Out[40]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age
0 6 148.0 72.0 35.0 125.0 33.6 0.627 50
1 1 85.0 66.0 29.0 125.0 26.6 0.351 31
2 8 183.0 64.0 29.0 125.0 23.3 0.672 32
3 1 89.0 66.0 23.0 94.0 28.1 0.167 21
4 0 137.0 40.0 35.0 168.0 43.1 2.288 33
In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
In [42]:
def svm_classifier(X_train, X_test, y_train, y_test):

    classifier_svm = SVC(kernel = 'rbf', random_state = 0)
    classifier_svm.fit(X_train, y_train)

    y_pred = classifier_svm.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)

    return print(f"Train score : {classifier_svm.score(X_train, y_train)}\nTest score : {classifier_svm.score(X_test, y_test)}")
     
In [43]:
def knn_classifier(X_train, X_test, y_train, y_test):

    classifier_knn = KNeighborsClassifier(metric = 'minkowski', p = 2)
    classifier_knn.fit(X_train, y_train)

    y_pred = classifier_knn.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)

    return print(f"Train score : {classifier_knn.score(X_train, y_train)}\nTest score : {classifier_knn.score(X_test, y_test)}")
     
In [44]:
def naive_classifier(X_train, X_test, y_train, y_test):

    classifier_naive = GaussianNB()
    classifier_naive.fit(X_train, y_train)

    y_pred = classifier_naive.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)

    return print(f"Train score : {classifier_naive.score(X_train, y_train)}\nTest score : {classifier_naive.score(X_test, y_test)}")
     
In [45]:
def tree_classifier(X_train, X_test, y_train, y_test):

    classifier_tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier_tree.fit(X_train, y_train)

    y_pred = classifier_tree.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)

    return print(f"Train score : {classifier_tree.score(X_train, y_train)}\nTest score : {classifier_tree.score(X_test, y_test)}")

     
In [46]:
def forest_classifier(X_train, X_test, y_train, y_test):
    classifier_forest = RandomForestClassifier(criterion = 'entropy', random_state = 0)
    classifier_forest.fit(X_train, y_train)

    y_pred = classifier_forest.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)

    return print(f"Train score : {classifier_forest.score(X_train, y_train)}\nTest score : {classifier_forest.score(X_test, y_test)}")

     
In [47]:
def print_score(X_train, X_test, y_train, y_test):
    print("SVM:\n")
    svm_classifier(X_train, X_test, y_train, y_test)

    print("-"*100)
    print()

    print("KNN:\n")
    knn_classifier(X_train, X_test, y_train, y_test)

    print("-"*100)
    print()

    print("Naive:\n")
    naive_classifier(X_train, X_test, y_train, y_test)

    print("-"*100)
    print()

    print("Decision Tree:\n")
    tree_classifier(X_train, X_test, y_train, y_test)

    print("-"*100)
    print()

    print("Random Forest:\n")
    forest_classifier(X_train, X_test, y_train, y_test)
In [48]:
print_score(X_train, X_test, y_train, y_test)
SVM:

Train score : 0.758957654723127
Test score : 0.7922077922077922
----------------------------------------------------------------------------------------------------

KNN:

Train score : 0.8013029315960912
Test score : 0.7662337662337663
----------------------------------------------------------------------------------------------------

Naive:

Train score : 0.745928338762215
Test score : 0.7857142857142857
----------------------------------------------------------------------------------------------------

Decision Tree:

Train score : 1.0
Test score : 0.6883116883116883
----------------------------------------------------------------------------------------------------

Random Forest:

Train score : 1.0
Test score : 0.8116883116883117
In [49]:
classifier_forest = RandomForestClassifier(criterion = 'entropy')
classifier_forest.fit(X_train, y_train)
y_pred = classifier_forest.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
cm
Out[49]:
array([[93, 14],
       [17, 30]])
In [50]:
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
Out[50]:
Predicted 0 1 All
True
0 93 14 107
1 17 30 47
All 110 44 154
In [51]:
data['Outcome'].value_counts()
Out[51]:
Outcome
0    500
1    268
Name: count, dtype: int64
In [52]:
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
In [53]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       107
           1       0.68      0.64      0.66        47

    accuracy                           0.80       154
   macro avg       0.76      0.75      0.76       154
weighted avg       0.80      0.80      0.80       154

In [54]:
y_pred_prob = classifier_forest.predict_proba(X_test)[:,1]
y_pred_prob
     
Out[54]:
array([0.91, 0.2 , 0.09, 0.63, 0.07, 0.02, 0.8 , 0.87, 0.26, 0.43, 0.73,
       0.79, 0.16, 0.15, 0.35, 0.41, 0.83, 0.01, 0.51, 0.21, 0.67, 0.14,
       0.06, 0.24, 0.02, 0.3 , 0.02, 0.86, 0.  , 0.09, 0.4 , 0.25, 0.23,
       0.65, 0.07, 0.74, 0.43, 0.02, 0.21, 0.7 , 0.23, 0.09, 0.13, 0.84,
       0.59, 0.08, 0.07, 0.12, 0.32, 0.23, 0.41, 0.16, 0.84, 0.63, 0.21,
       0.05, 0.16, 0.36, 0.25, 0.51, 0.66, 0.7 , 0.05, 0.68, 0.9 , 0.49,
       0.66, 0.11, 0.7 , 0.26, 0.05, 0.2 , 0.09, 0.79, 0.89, 0.52, 0.11,
       0.6 , 0.3 , 0.28, 0.42, 0.43, 0.12, 0.01, 0.22, 0.15, 0.07, 0.29,
       0.9 , 0.09, 0.31, 0.24, 0.11, 0.02, 0.66, 0.13, 0.27, 0.42, 0.39,
       0.54, 0.16, 0.  , 0.14, 0.04, 0.63, 0.63, 0.1 , 0.62, 0.04, 0.49,
       0.02, 0.55, 0.59, 0.42, 0.69, 0.63, 0.05, 0.35, 0.1 , 0.82, 0.38,
       0.38, 0.2 , 0.3 , 0.08, 0.  , 0.28, 0.43, 0.46, 0.4 , 0.43, 0.35,
       0.03, 0.73, 0.22, 0.66, 0.28, 0.67, 0.43, 0.16, 0.06, 0.8 , 0.01,
       0.18, 0.75, 0.  , 0.07, 0.1 , 0.07, 0.31, 0.06, 0.31, 0.04, 0.19])
In [55]:
fpr, tpr, threshold = roc_curve(y_test, y_pred_prob)
print("FPR:\n\n", fpr)


print("-"*100)

print("TPR:\n\n", tpr)
FPR:

 [0.         0.         0.         0.         0.00934579 0.00934579
 0.00934579 0.02803738 0.02803738 0.02803738 0.05607477 0.05607477
 0.05607477 0.06542056 0.07476636 0.09345794 0.11214953 0.11214953
 0.11214953 0.12149533 0.13084112 0.13084112 0.14018692 0.1682243
 0.18691589 0.22429907 0.22429907 0.24299065 0.24299065 0.27102804
 0.27102804 0.28037383 0.30841121 0.31775701 0.34579439 0.34579439
 0.36448598 0.37383178 0.39252336 0.41121495 0.42990654 0.48598131
 0.5046729  0.55140187 0.57009346 0.57943925 0.59813084 0.60747664
 0.6635514  0.71028037 0.72897196 0.77570093 0.80373832 0.8411215
 0.86915888 0.87850467 0.93457944 0.96261682 1.        ]
----------------------------------------------------------------------------------------------------
TPR:

 [0.         0.0212766  0.06382979 0.10638298 0.10638298 0.14893617
 0.19148936 0.23404255 0.27659574 0.31914894 0.31914894 0.36170213
 0.40425532 0.46808511 0.46808511 0.53191489 0.53191489 0.57446809
 0.61702128 0.61702128 0.63829787 0.68085106 0.68085106 0.74468085
 0.76595745 0.76595745 0.78723404 0.78723404 0.80851064 0.80851064
 0.82978723 0.87234043 0.87234043 0.87234043 0.87234043 0.89361702
 0.89361702 0.91489362 0.91489362 0.93617021 0.93617021 0.93617021
 0.93617021 0.93617021 0.93617021 0.95744681 0.95744681 0.9787234
 0.9787234  0.9787234  0.9787234  1.         1.         1.
 1.         1.         1.         1.         1.        ]
In [56]:
plt.plot([0, 1], [0, 1], "k--", label = '50% AUC')
plt.plot(fpr, tpr, label = "Random Forest")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curve - Random Forest")
plt.show()
No description has been provided for this image
In [57]:
roc_auc_score(y_test,y_pred_prob)
Out[57]:
np.float64(0.8583217339431298)
In [58]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier_forest, X = X_train, y = y_train, cv = 10)
print(accuracies.mean(), accuracies.std())
0.7491274457958752 0.04828104302127529
In [59]:
from sklearn.model_selection import GridSearchCV
In [60]:
parameters = {
    'n_estimators': [25, 50, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [14, 20, 25, 30]
}
    
In [61]:
grid_search = GridSearchCV(estimator = classifier_forest,
                          param_grid = parameters,
                          scoring = 'accuracy',
                          cv = 10,
                          n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
print('best_accuracy = ',grid_search.best_score_)
print('best_parameters = ', grid_search.best_params_)
best_accuracy =  0.7606292966684294
best_parameters =  {'criterion': 'gini', 'max_depth': 14, 'n_estimators': 300}
In [62]:
classifier_forest = RandomForestClassifier(criterion = 'gini', max_depth = 25, n_estimators = 200, random_state = 0)
classifier_forest.fit(X_train, y_train)
y_pred = classifier_forest.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
cm
Out[62]:
array([[94, 13],
       [13, 34]])
In [63]:
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.88      0.88      0.88       107
           1       0.72      0.72      0.72        47

    accuracy                           0.83       154
   macro avg       0.80      0.80      0.80       154
weighted avg       0.83      0.83      0.83       154

In [64]:
confusion_matrix(y_test, y_pred)
Out[64]:
array([[94, 13],
       [13, 34]])
In [ ]: